{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导入库和数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sys\n",
    "import seaborn as sns\n",
    "from time import sleep\n",
    "import matplotlib.pyplot as plt\n",
    "on_train=pd.read_csv(r'D:\\Data\\TCForNewComer\\ccf_online_stage1_train\\ccf_online_stage1_train.csv')\n",
    "off_train=pd.read_csv(r'D:\\Data\\TCForNewComer\\ccf_offline_stage1_train\\ccf_offline_stage1_train.csv')\n",
    "# oftid_train=pd.read_csv(r'D:\\Data\\TCForNewComer\\off_train_sameUser_id_test.csv')\n",
    "test=pd.read_csv(r'D:\\Data\\TCForNewComer\\ccf_offline_stage1_test_revised.csv')\n",
    "samplt=pd.read_csv(r'D:\\Data\\TCForNewComer\\sample_submission.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>User_id</th>\n",
       "      <th>Merchant_id</th>\n",
       "      <th>Coupon_id</th>\n",
       "      <th>Discount_rate</th>\n",
       "      <th>Distance</th>\n",
       "      <th>Date_received</th>\n",
       "      <th>Date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4</td>\n",
       "      <td>1433</td>\n",
       "      <td>8735</td>\n",
       "      <td>30:5</td>\n",
       "      <td>10</td>\n",
       "      <td>20160214</td>\n",
       "      <td>null</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>1469</td>\n",
       "      <td>2902</td>\n",
       "      <td>0.95</td>\n",
       "      <td>10</td>\n",
       "      <td>20160607</td>\n",
       "      <td>null</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>35</td>\n",
       "      <td>3381</td>\n",
       "      <td>1807</td>\n",
       "      <td>300:30</td>\n",
       "      <td>0</td>\n",
       "      <td>20160130</td>\n",
       "      <td>null</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35</td>\n",
       "      <td>3381</td>\n",
       "      <td>9776</td>\n",
       "      <td>10:5</td>\n",
       "      <td>0</td>\n",
       "      <td>20160129</td>\n",
       "      <td>null</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35</td>\n",
       "      <td>3381</td>\n",
       "      <td>11951</td>\n",
       "      <td>200:20</td>\n",
       "      <td>0</td>\n",
       "      <td>20160129</td>\n",
       "      <td>null</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   User_id  Merchant_id Coupon_id Discount_rate Distance Date_received  Date\n",
       "0        4         1433      8735          30:5       10      20160214  null\n",
       "1        4         1469      2902          0.95       10      20160607  null\n",
       "2       35         3381      1807        300:30        0      20160130  null\n",
       "3       35         3381      9776          10:5        0      20160129  null\n",
       "4       35         3381     11951        200:20        0      20160129  null"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "off_train=off_train.sort_values(by=['User_id'])\n",
    "off_train.index=np.arange(0,len(off_train),1)\n",
    "off_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>User_id</th>\n",
       "      <th>Merchant_id</th>\n",
       "      <th>Coupon_id</th>\n",
       "      <th>Discount_rate</th>\n",
       "      <th>Distance</th>\n",
       "      <th>Date_received</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4129537</td>\n",
       "      <td>450</td>\n",
       "      <td>9983</td>\n",
       "      <td>30:5</td>\n",
       "      <td>1</td>\n",
       "      <td>20160712</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>6949378</td>\n",
       "      <td>1300</td>\n",
       "      <td>3429</td>\n",
       "      <td>30:5</td>\n",
       "      <td>null</td>\n",
       "      <td>20160706</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2166529</td>\n",
       "      <td>7113</td>\n",
       "      <td>6928</td>\n",
       "      <td>200:20</td>\n",
       "      <td>5</td>\n",
       "      <td>20160727</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2166529</td>\n",
       "      <td>7113</td>\n",
       "      <td>1808</td>\n",
       "      <td>100:10</td>\n",
       "      <td>5</td>\n",
       "      <td>20160727</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6172162</td>\n",
       "      <td>7605</td>\n",
       "      <td>6500</td>\n",
       "      <td>30:1</td>\n",
       "      <td>2</td>\n",
       "      <td>20160708</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   User_id  Merchant_id  Coupon_id Discount_rate Distance Date_received\n",
       "0  4129537          450       9983          30:5        1      20160712\n",
       "1  6949378         1300       3429          30:5     null      20160706\n",
       "2  2166529         7113       6928        200:20        5      20160727\n",
       "3  2166529         7113       1808        100:10        5      20160727\n",
       "4  6172162         7605       6500          30:1        2      20160708"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.sort_values(by=['User_id'])\n",
    "test['Date_received']=test['Date_received'].astype(np.int64)\n",
    "test['Date_received']=test['Date_received'].apply(lambda x: str(x))\n",
    "test.index=np.arange(0,len(test),1)\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>User_id</th>\n",
       "      <th>Merchant_id</th>\n",
       "      <th>Action</th>\n",
       "      <th>Coupon_id</th>\n",
       "      <th>Discount_rate</th>\n",
       "      <th>Date_received</th>\n",
       "      <th>Date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4</td>\n",
       "      <td>25104</td>\n",
       "      <td>2</td>\n",
       "      <td>100145044</td>\n",
       "      <td>100:10</td>\n",
       "      <td>20160331</td>\n",
       "      <td>null</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>45612</td>\n",
       "      <td>1</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>20160308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>36</td>\n",
       "      <td>46701</td>\n",
       "      <td>0</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>20160120</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>64</td>\n",
       "      <td>11200</td>\n",
       "      <td>0</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>20160526</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>64</td>\n",
       "      <td>29214</td>\n",
       "      <td>0</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>20160606</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   User_id  Merchant_id  Action  Coupon_id Discount_rate Date_received  \\\n",
       "0        4        25104       2  100145044        100:10      20160331   \n",
       "1        4        45612       1       null          null          null   \n",
       "2       36        46701       0       null          null          null   \n",
       "3       64        11200       0       null          null          null   \n",
       "4       64        29214       0       null          null          null   \n",
       "\n",
       "       Date  \n",
       "0      null  \n",
       "1  20160308  \n",
       "2  20160120  \n",
       "3  20160526  \n",
       "4  20160606  "
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "on_train=on_train.sort_values(by=['User_id'])\n",
    "on_train.index=np.arange(0,len(on_train),1)\n",
    "on_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>User_id</th>\n",
       "      <th>Merchant_id</th>\n",
       "      <th>Action</th>\n",
       "      <th>Coupon_id</th>\n",
       "      <th>Discount_rate</th>\n",
       "      <th>Date_received</th>\n",
       "      <th>Date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4</td>\n",
       "      <td>25104</td>\n",
       "      <td>2</td>\n",
       "      <td>100145044</td>\n",
       "      <td>100:10</td>\n",
       "      <td>20160331</td>\n",
       "      <td>null</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>45612</td>\n",
       "      <td>1</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>20160308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>36</td>\n",
       "      <td>46701</td>\n",
       "      <td>0</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>20160120</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>64</td>\n",
       "      <td>11200</td>\n",
       "      <td>0</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>null</td>\n",
       "      <td>20160526</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   User_id  Merchant_id  Action  Coupon_id Discount_rate Date_received  \\\n",
       "0        4        25104       2  100145044        100:10      20160331   \n",
       "1        4        45612       1       null          null          null   \n",
       "2       36        46701       0       null          null          null   \n",
       "3       64        11200       0       null          null          null   \n",
       "\n",
       "       Date  \n",
       "0      null  \n",
       "1  20160308  \n",
       "2  20160120  \n",
       "3  20160526  "
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "on_train.loc[[0,1,2,3]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "col='User_id'\n",
    "a=set(on_train[col].values)\n",
    "b=set(test[col].values)\n",
    "c=set(oftid_train[col].values)\n",
    "print(len(b))\n",
    "print(len(a))\n",
    "print(len(c))\n",
    "\n",
    "print(len(a&b))\n",
    "print(len(b&c))\n",
    "print(len(c-b))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 增加一列之前没用优惠券是否购买过相同商品\n",
    "- 可能是日常用品，或者零食\n",
    "- 可能是耐用品，然后买了之后就不会再买"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "indexs=oftid_train[(oftid_train['Coupon_id']=='null')&(oftid_train['Date']!='null')].index\n",
    "odinary=oftid_train.iloc[indexs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "oftid_train.drop(indexs,axis=0,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test['ord_buy']=0#增加一列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test.index=np.arange(0,len(test),1)\n",
    "odinary.index=np.arange(0,len(odinary),1)\n",
    "oftid_train.index=np.arange(0,len(oftid_train),1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#计算同一件商品半年内普通购买次数\n",
    "from ipykernel import kernelapp as app\n",
    "i=0\n",
    "j=10\n",
    "length=len(test)\n",
    "for index in test.index:\n",
    "    rate=int(i*100/length)#看进度\n",
    "    if (rate in list(np.arange(10.,110.,10.)))&(rate==j):\n",
    "        print('.'*int(rate/10),rate,'%')\n",
    "        j=j+10\n",
    "\n",
    "    uid=test.iloc[index]['User_id']\n",
    "    mid=test.iloc[index]['Merchant_id']\n",
    "    if len(odinary[odinary['User_id']==uid].index)!=0:\n",
    "        for ind in odinary[odinary['User_id']==uid].index:\n",
    "            if odinary.iloc[ind]['Merchant_id']==mid:\n",
    "                test.loc[index,'ord_buy']+=1\n",
    "    i+=1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test.to_csv(r'D:\\Data\\TCForNewComer\\deal\\test_addordinarybuy.csv',index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 增加用券在线购买和普通购买、领取优惠券的列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "click_index=on_train[on_train['Action']==0].index\n",
    "on_train=on_train.drop(click_index,axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "a=set(on_train['User_id'].values)\n",
    "b=set(test['User_id'].values)\n",
    "print(len(b))\n",
    "print(len(a))\n",
    "print(len(a&b))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#删除on_train中的User_id不一样的行\n",
    "row=[]\n",
    "i=0\n",
    "j=10\n",
    "length=len(a&b)\n",
    "for i in (a&b):\n",
    "    rate=int(i*100/length)\n",
    "    if (rate in list(np.arange(10.,110.,10.)))&(rate==j):\n",
    "        print('.'*int(rate/10),rate,'%')\n",
    "        j=j+10\n",
    "    row.extend(list(off_train[off_train['User_id']==i].index))\n",
    "    i+=1\n",
    "on_train.drop(row,axis=0,inplace=True)\n",
    "\n",
    "on_train.to_csv(r'D:\\Data\\TCForNewComer\\deal\\on0_train_sameUser_id_test.csv',index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 选取间隔日期大于15天，标记为负样本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "p_train=off_train[(off_train['Coupon_id']!='null')&(off_train['Date']!='null')]#正样本\n",
    "n_train=off_train[(off_train['Coupon_id']!='null')&(off_train['Date']=='null')]#负样本\n",
    "\n",
    "#转换为时间格式\n",
    "p_train['Date_received']=pd.to_datetime(p_train['Date_received'])\n",
    "p_train['Date']=pd.to_datetime(p_train['Date'])\n",
    "n_train['Date_received']=pd.to_datetime(n_train['Date_received'])\n",
    "test['Date_received']=pd.to_datetime(test['Date_received'])\n",
    "\n",
    "p_train=p_train[((p_train['Date']-p_train['Date_received']).astype('timedelta64[D]'))<=15]#保留拿到和使用日期间隔小于15天的数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 找出正样本和负样本中的Merchant_id列和测试集相同的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "p_train=p_train.sort_values(by=['User_id'])\n",
    "n_train=n_train.sort_values(by=['User_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "col='Merchant_id'\n",
    "p=set(p_train[col].values)\n",
    "t=set(test[col].values)\n",
    "n=set(n_train[col].values)\n",
    "print(len(p))\n",
    "print(len(n))\n",
    "print(len(t))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "p_t=list(p&t)\n",
    "n_t=list(n&t)\n",
    "print(len(p_t))\n",
    "print(len(n_t))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "p_tdf=pd.DataFrame()\n",
    "for i in p_t:\n",
    "    daf=p_train[p_train[col]==i]\n",
    "    p_tdf=pd.concat([p_tdf,daf],axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_tdf=pd.DataFrame()\n",
    "for i in n_t:\n",
    "    daf=n_train[n_train[col]==i]\n",
    "    n_tdf=pd.concat([n_tdf,daf],axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用普通购买的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "col='User_id'\n",
    "p=set(p_train[col].values)\n",
    "n=set(n_train[col].values)\n",
    "print(len(p))\n",
    "print(len(n))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "p_tdf=pd.DataFrame()\n",
    "for i in (p|n):\n",
    "    daf=off_train[(off_train[col]==i)&(off_train['Date']!='')]\n",
    "    p_tdf=pd.concat([p_tdf,daf],axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 删除负样本中Coupon_id列的值不在测试集中的行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "n_tdf['Coupon_id']=n_tdf['Coupon_id'].astype(np.int64)\n",
    "test['Coupon_id']=test['Coupon_id'].astype(np.int64)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "col='Coupon_id'\n",
    "bincol=[]*len(n_tdf)\n",
    "n1=set(n_tdf[col].values)\n",
    "t1=set(test[col].values)\n",
    "print(len(n1))\n",
    "print(len(t1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "no_exist_test=n1-t1\n",
    "for i in no_exist_test:\n",
    "    n_tdf.drop(n_tdf[n_tdf[col]==i].index,axis=0,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "p_tdf['Date_received']=p_tdf['Date_received'].apply(lambda x: x.weekday()+1)#转换为星期几\n",
    "n_tdf['Date_received']=n_tdf['Date_received'].apply(lambda x: x.weekday()+1)\n",
    "test['Date_received']=test['Date_received'].apply(lambda x: x.weekday()+1)\n",
    "#正样本为1，负样本为0\n",
    "p_tdf.loc[:,'Date']=1\n",
    "n_tdf.loc[:,'Date']=0\n",
    "\n",
    "train=pd.concat([p_tdf,n_tdf],axis=0)\n",
    "train=train.sort_values(by=['User_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train['Date_received']=train['Date_received'].apply(lambda x: 1 if x<=5 else 0)\n",
    "test['Date_received']=test['Date_received'].apply(lambda x: 1 if x<=5 else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train=train.replace('null',np.nan)\n",
    "test=test.replace('null',np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train['Coupon_id']=train['Coupon_id'].astype(np.int64)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train['Merchant_id']=train['Merchant_id'].astype(np.object)\n",
    "train['User_id']=train['User_id'].astype(np.object)\n",
    "train['Date_received']=train['Date_received'].astype('category')\n",
    "test['Merchant_id']=test['Merchant_id'].astype(np.object)\n",
    "test['Coupon_id']=test['Coupon_id'].astype(np.object)\n",
    "test['Distance']=test['Distance'].astype(np.object)\n",
    "test['Date_received']=test['Date_received'].astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test['User_id']=test['User_id'].astype(np.object)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 转化满减为小数折扣"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def Discount_rate(x):#转化函数\n",
    "    if x.startswith('0'):\n",
    "        return float(x)\n",
    "    else:\n",
    "        return round((float(x.split(':')[0])-float(x.split(':')[1]))/float(x.split(':')[0]),2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def Discount_rate(x):#转化函数\n",
    "    if x.startswith('0'):\n",
    "        return float(x)\n",
    "    else:\n",
    "        return int(x.split(':')[0])-int(x.split(':')[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#将满减转化为小数\n",
    "train['Discount_rate']=train['Discount_rate'].apply(Discount_rate)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 填充训练集的空缺值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train=train.fillna(method='ffill')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train['Distance']=train['Distance'].astype(np.int64)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train.isnull().any()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 填充测试集的空缺值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test=test.fillna(method='ffill')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test['Discount_rate']=test['Discount_rate'].apply(Discount_rate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# test['Merchant_id']=test['Merchant_id'].astype(np.int64)\n",
    "# test['Coupon_id']=test['Coupon_id'].astype(np.int64)\n",
    "test['Distance']=test['Distance'].astype(np.int64)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "test.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 删除没有购买行为的优惠券数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for j in list(set(train['Coupon_id'].values)):\n",
    "        df_1=train[train['Coupon_id']==j]\n",
    "        if len(np.unique(df_1['Date']))==1:\n",
    "            for i in train[train['Coupon_id']==j].index:\n",
    "                train['Coupon_id'].drop(i,axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#类别形变量转换为指示性变量\n",
    "col='Date_received'\n",
    "new_dummies=pd.get_dummies(train[col],prefix=col)\n",
    "train=pd.concat([train,new_dummies],axis=1)\n",
    "train.drop([col],axis=1,inplace=True)\n",
    "\n",
    "new_dummies=pd.get_dummies(test[col],prefix=col)\n",
    "test=pd.concat([test,new_dummies],axis=1)\n",
    "test.drop([col],axis=1,inplace=True)\n",
    "\n",
    "len(test.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 距离统计分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import seaborn as sns \n",
    "%matplotlib inline\n",
    "\n",
    "sns.countplot(x='Distance',hue='Date',data=train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "%matplotlib inline\n",
    "\n",
    "sns.countplot(x='Date_received',hue='Date',data=train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import seaborn as sns \n",
    "%matplotlib inline\n",
    "plt.figure(figsize=(10,6))\n",
    "sns.countplot(x='Discount_rate',hue='Date',data=train)\n",
    "plt.xticks(rotation=90)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "a=set(train['Coupon_id'].values)\n",
    "c=set(test['Coupon_id'].values)\n",
    "len(c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "corr=train.corr()\n",
    "corr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "combine=[train,test]\n",
    "for data in combine:\n",
    "    data.drop(['Date_received'],axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "a=set(train['Merchant_id'].values)\n",
    "c=set(test['Merchant_id'].values)\n",
    "len(c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "len(a&c)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 训练模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "K=3\n",
    "from sklearn.metrics import roc_auc_score,auc\n",
    "from time import sleep\n",
    "import winsound\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "# kf = KFold(n_splits = K,random_state = 1,shuffle = True)\n",
    "kf=StratifiedKFold(n_splits = K,random_state = 90,shuffle = True)\n",
    "X=train.drop(['Date'],axis=1)\n",
    "y=train['Date']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 归一化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
    "x_predict=test\n",
    "# 1. min max scaler\n",
    "min_max_scaler = MinMaxScaler()\n",
    "x_predict_min_max_scaled = min_max_scaler.fit_transform(x_predict)\n",
    "\n",
    "# 2. standard scaler\n",
    "std_scaler = StandardScaler()\n",
    "x_predict_std_scaled = std_scaler.fit_transform(x_predict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 1. min max scaler\n",
    "min_max_scaler = MinMaxScaler()\n",
    "# x_train_min_max_scaled = min_max_scaler.fit_transform(x_train)\n",
    "# x_test_min_max_scaled = min_max_scaler.transform(x_test)\n",
    "X_test_min_max_scaled=min_max_scaler.fit_transform(X)#整个数据集\n",
    "# 2. standard scaler\n",
    "std_scaler = StandardScaler()\n",
    "# x_train_std_scaled = std_scaler.fit_transform(x_train)\n",
    "# x_test_std_scaled = std_scaler.transform(x_test)\n",
    "X_test_std_scaled=std_scaler.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "#全部  0.5552 learning_rate=0.09\n",
    "model=xgb.XGBClassifier(n_estimators=200,\n",
    "                        max_depth=2,#6\n",
    "                        objective=\"binary:logistic\",\n",
    "                        learning_rate=0.09, \n",
    "                        subsample=.8,\n",
    "                        min_child_weight=6,\n",
    "                        colsample_bytree=.4,\n",
    "                        scale_pos_weight=1.6,\n",
    "                        gamma=9,\n",
    "                        seed=100,\n",
    "                        reg_alpha=8,\n",
    "                        reg_lambda=1.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def Avg_auc(pred):\n",
    "    aucc=0\n",
    "    i=0\n",
    "    df=train.copy()\n",
    "    df['y_valid_pred']=pred\n",
    "    for j in list(set(df['Coupon_id'].values)):\n",
    "        df_1=df[df['Coupon_id']==j]\n",
    "        if len(np.unique(df_1['Date']))==1:\n",
    "            continue\n",
    "        aucc=aucc+roc_auc_score(df_1['Date'],df_1['y_valid_pred'])\n",
    "        i+=1\n",
    "    aucc=aucc/i\n",
    "    df.drop(['y_valid_pred'],axis=1,inplace=True)\n",
    "    return aucc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import GaussianNB\n",
    "model = GaussianNB()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import AdaBoostClassifier# \n",
    "model=AdaBoostClassifier(n_estimators=400,learning_rate=1,random_state=100)\n",
    "model.fit(X,y)\n",
    "pred=model.predict_proba(X)[:,1]\n",
    "print('Avg_auc for full training set:',Avg_auc(pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "y_test_pred=0\n",
    "y_valid_pred=0*y\n",
    "fit_model=[]\n",
    "# XMIN=pd.DataFrame(X_test_std_scaled,columns=['Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance', 'Date_received'])\n",
    "for i,(train_index,test_index) in enumerate(kf.split(X,y)):\n",
    "    y_train,y_valid=y.iloc[train_index],y.iloc[test_index]\n",
    "    x_train,x_valid=X.iloc[train_index],X.iloc[test_index]\n",
    "    print('\\nFlod:',i)\n",
    "    fitted_model=model.fit(x_train,y_train)\n",
    "    pred= fitted_model.predict_proba(x_valid)[:,1]\n",
    "    y_valid_pred.iloc[test_index]=pred\n",
    "    print(pd.DataFrame(pred).head(1))\n",
    "    \n",
    "    y_test_pred+=fitted_model.predict_proba(test)[:,1]\n",
    "    fit_model.append(fitted_model)\n",
    "    winsound.Beep(600,1000)\n",
    "    sleep(1)\n",
    "y_test_pred/=K\n",
    "print('Avg_auc for full training set:',Avg_auc(y_valid_pred))\n",
    "# sub=pd.DataFrame({'id':test_id,'target':y_test_pred})\n",
    "# sub.to_csv(r'D:\\Data\\Porto Seguro s Safe Driver Prediction\\result\\123456KFlod_submission.csv',float_format='%.4f',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_train['a']=np.random.normal(len(y_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print('Avg_auc for full training set:',Avg_auc(y_valid_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.metrics import roc_auc_score\n",
    "y_true = np.array([0, 1, 0, 1])\n",
    "y_scores = np.array([0.1, 0.4, 0.35, 0.8])\n",
    "y_true.astype(np.int64)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
